Libraries and Imports¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import pearsonr
import warnings
In [2]:
!pip install pyarrow
Requirement already satisfied: pyarrow in ./.env/lib/python3.11/site-packages (21.0.0)
In [3]:
# AWS + Spark imports and warning configuration.
import boto3
from io import StringIO
from pyspark.sql import SparkSession
# Fix: replaced `from pyspark.sql.functions import *` with explicit imports.
# Only avg/desc are used below, and the wildcard silently shadowed Python
# builtins such as sum/max/min with Spark column functions.
from pyspark.sql.functions import avg, desc
from pyspark.sql.types import *  # NOTE(review): no pyspark types are referenced in this notebook — candidate for removal
warnings.filterwarnings('ignore')  # NOTE(review): blanket suppression hides real warnings; consider narrowing
Spark Configuration¶
Inicializar sesión de Spark¶
Note: the minimal initialization shown below is for reference only — the fully configured session created in the next cell (with the S3A connector and AWS credential settings) is the one actually used in this notebook.
spark = SparkSession.builder.appName("VideoGameAnalysis").master("local[*]").getOrCreate()
In [5]:
# Spark session configured for S3 access through the s3a connector.
# Fix: removed the duplicate `from pyspark.sql import SparkSession` — it is
# already imported in the imports cell above.
spark = (
    SparkSession.builder
    .appName("Video_Games_R")
    # Pull the Hadoop S3A connector + AWS SDK bundle at session start.
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.12.367")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Resolve credentials from env vars / shared config / instance profile.
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .getOrCreate()
)

# Fix: renamed `df` -> `df_parquet` so this Spark frame no longer clobbers
# (and is no longer clobbered by) the pandas DataFrame a later cell binds to
# `df` — a classic hidden-state hazard on partial re-runs.
df_parquet = spark.read.parquet(
    "s3a://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet"  # where to look for the parquet
)
df_parquet.printSchema()
df_parquet.show(3, truncate=False)
root |-- Game Title: string (nullable = true) |-- User Rating: double (nullable = true) |-- Age Group Targeted: string (nullable = true) |-- Price: double (nullable = true) |-- Platform: string (nullable = true) |-- Developer: string (nullable = true) |-- Publisher: string (nullable = true) |-- Release Year: long (nullable = true) |-- Genre: string (nullable = true) |-- Multiplayer: string (nullable = true) |-- Game Length (Hours): double (nullable = true) |-- Graphics Quality: string (nullable = true) |-- Soundtrack Quality: string (nullable = true) |-- Story Quality: string (nullable = true) |-- User Review Text: string (nullable = true) |-- Game Mode: string (nullable = true) +------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+ |Game Title |User Rating|Age Group Targeted|Price|Platform|Developer |Publisher |Release Year|Genre |Multiplayer|Game Length (Hours)|Graphics Quality|Soundtrack Quality|Story Quality|User Review Text |Game Mode| +------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+ |Grand Theft Auto V|36.4 |All Ages |41.41|PC |Game Freak|Innersloth |2015 |Adventure|No |55.3 |Medium |Average |Poor |Solid game, but too many bugs. |Offline | |The Sims 4 |38.3 |Adults |57.56|PC |Nintendo |Electronic Arts|2015 |Shooter |Yes |34.6 |Low |Poor |Poor |Solid game, but too many bugs. 
|Offline | |Minecraft |26.8 |Teens |44.93|PC |Bungie |Capcom |2012 |Adventure|Yes |13.9 |Low |Good |Average |Great game, but the graphics could be better.|Offline | +------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+ only showing top 3 rows
AWS S3 Configuration¶
In [6]:
# boto3 S3 client; credentials come from the default AWS provider chain.
s3 = boto3.client("s3")
# S3 source configuration: bucket and object key of the raw reviews CSV.
bucket_name = "xideralaws-curso-uriel"
object_key = "video_game_reviews.csv"
Data Loading from S3¶
In [7]:
# Load the raw reviews CSV from S3 into a pandas DataFrame.
print("=== CARGANDO DATOS DESDE AWS S3 ===")
# Fetch the object from S3.
response = s3.get_object(Bucket=bucket_name, Key=object_key)
# Read the streamed body into memory and decode (the file is small enough).
csv_content = response["Body"].read().decode("utf-8")
# Parse the in-memory CSV text with pandas.
df = pd.read_csv(StringIO(csv_content))
# Fix: dropped the needless f-prefix — the string has no placeholders.
print("✅ Datos cargados exitosamente desde S3")
print(f"Bucket: {bucket_name}")
print(f"Archivo: {object_key}")
=== CARGANDO DATOS DESDE AWS S3 === ✅ Datos cargados exitosamente desde S3 Bucket: xideralaws-curso-uriel Archivo: video_game_reviews.csv
General Information about the Dataset¶
In [8]:
# Structural overview of the raw dataset: shape, columns, dtypes,
# missing values, and numeric descriptive statistics.
print("\n=== INFORMACIÓN GENERAL DEL DATASET ===")
print(f"Forma del dataset: {df.shape}")
print(f"\nColumnas: {list(df.columns)}")
# Fix: removed f-prefixes on the two literals below — no placeholders.
print("\nTipos de datos:")
print(df.dtypes)
print("\nValores faltantes:")
print(df.isnull().sum())
# Descriptive statistics (numeric columns only)
print("\n=== ESTADÍSTICAS DESCRIPTIVAS ===")
print(df.describe())
=== INFORMACIÓN GENERAL DEL DATASET ===
Forma del dataset: (47774, 18)
Columnas: ['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform', 'Requires Special Device', 'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer', 'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality', 'Story Quality', 'User Review Text', 'Game Mode', 'Min Number of Players']
Tipos de datos:
Game Title object
User Rating float64
Age Group Targeted object
Price float64
Platform object
Requires Special Device object
Developer object
Publisher object
Release Year int64
Genre object
Multiplayer object
Game Length (Hours) float64
Graphics Quality object
Soundtrack Quality object
Story Quality object
User Review Text object
Game Mode object
Min Number of Players int64
dtype: object
Valores faltantes:
Game Title 0
User Rating 0
Age Group Targeted 0
Price 0
Platform 0
Requires Special Device 0
Developer 0
Publisher 0
Release Year 0
Genre 0
Multiplayer 0
Game Length (Hours) 0
Graphics Quality 0
Soundtrack Quality 0
Story Quality 0
User Review Text 0
Game Mode 0
Min Number of Players 0
dtype: int64
=== ESTADÍSTICAS DESCRIPTIVAS ===
User Rating Price Release Year Game Length (Hours) \
count 47774.000000 47774.000000 47774.000000 47774.000000
mean 29.719329 39.951371 2016.480952 32.481672
std 7.550131 11.520342 4.027276 15.872508
min 10.100000 19.990000 2010.000000 5.000000
25% 24.300000 29.990000 2013.000000 18.800000
50% 29.700000 39.845000 2016.000000 32.500000
75% 35.100000 49.957500 2020.000000 46.300000
max 49.500000 59.990000 2023.000000 60.000000
Min Number of Players
count 47774.000000
mean 5.116758
std 2.769521
min 1.000000
25% 3.000000
50% 5.000000
75% 7.000000
max 10.000000
Conversion to Spark DataFrame¶
In [9]:
print("\n=== CREANDO SPARK DATAFRAME ===")
# Convert the in-memory pandas DataFrame to a Spark DataFrame.
# NOTE(review): Spark warns about large task sizes here because the whole
# pandas frame (~47k rows) is serialized into the job — expected for local mode.
df_spark = spark.createDataFrame(df)
# Show the schema Spark inferred from the pandas dtypes.
print("Schema del DataFrame de Spark:")
df_spark.printSchema()
# count() triggers a full Spark job over the data.
print(f"Número de filas en Spark DataFrame: {df_spark.count()}")
print("Primeras 5 filas:")
df_spark.show(5)
=== CREANDO SPARK DATAFRAME === Schema del DataFrame de Spark: root |-- Game Title: string (nullable = true) |-- User Rating: double (nullable = true) |-- Age Group Targeted: string (nullable = true) |-- Price: double (nullable = true) |-- Platform: string (nullable = true) |-- Requires Special Device: string (nullable = true) |-- Developer: string (nullable = true) |-- Publisher: string (nullable = true) |-- Release Year: long (nullable = true) |-- Genre: string (nullable = true) |-- Multiplayer: string (nullable = true) |-- Game Length (Hours): double (nullable = true) |-- Graphics Quality: string (nullable = true) |-- Soundtrack Quality: string (nullable = true) |-- Story Quality: string (nullable = true) |-- User Review Text: string (nullable = true) |-- Game Mode: string (nullable = true) |-- Min Number of Players: long (nullable = true)
25/08/29 10:10:10 WARN TaskSetManager: Stage 4 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
Número de filas en Spark DataFrame: 47774 Primeras 5 filas:
25/08/29 10:10:13 WARN TaskSetManager: Stage 7 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
+------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+ | Game Title|User Rating|Age Group Targeted|Price| Platform|Requires Special Device| Developer| Publisher|Release Year| Genre|Multiplayer|Game Length (Hours)|Graphics Quality|Soundtrack Quality|Story Quality| User Review Text|Game Mode|Min Number of Players| +------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+ |Grand Theft Auto V| 36.4| All Ages|41.41| PC| No|Game Freak| Innersloth| 2015|Adventure| No| 55.3| Medium| Average| Poor|Solid game, but t...| Offline| 1| | The Sims 4| 38.3| Adults|57.56| PC| No| Nintendo|Electronic Arts| 2015| Shooter| Yes| 34.6| Low| Poor| Poor|Solid game, but t...| Offline| 3| | Minecraft| 26.8| Teens|44.93| PC| Yes| Bungie| Capcom| 2012|Adventure| Yes| 13.9| Low| Good| Average|Great game, but t...| Offline| 5| | Bioshock Infinite| 38.4| All Ages|48.29| Mobile| Yes|Game Freak| Nintendo| 2015| Sports| No| 41.9| Medium| Good| Excellent|Solid game, but t...| Online| 4| | Half-Life: Alyx| 30.1| Adults|55.49|PlayStation| Yes|Game Freak| Epic Games| 2022| RPG| Yes| 13.2| High| Poor| Good|Great game, but t...| Offline| 1| +------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+ only showing top 5 rows
Analysis with Spark¶
In [10]:
# Basic aggregations computed on the Spark DataFrame.
print("\n=== ANÁLISIS CON SPARK ===")

print("Columnas del DataFrame de Spark:")
print(df_spark.columns)

# Average user rating per genre, best-rated first.
print("\nRating promedio por género (usando Spark):")
rating_by_genre = (
    df_spark
    .groupBy("Genre")
    .agg(avg("User Rating").alias("avg_rating"))
    .orderBy(desc("avg_rating"))
)
rating_by_genre.show()

# Ten most-reviewed titles (row count per game).
print("\nTop 10 juegos más reseñados (usando Spark):")
reviews_per_game = (
    df_spark
    .groupBy("Game Title")
    .count()
    .orderBy(desc("count"))
)
reviews_per_game.show(10)
=== ANÁLISIS CON SPARK === Columnas del DataFrame de Spark: ['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform', 'Requires Special Device', 'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer', 'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality', 'Story Quality', 'User Review Text', 'Game Mode', 'Min Number of Players'] Rating promedio por género (usando Spark):
25/08/29 10:10:16 WARN TaskSetManager: Stage 8 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
+----------+------------------+ | Genre| avg_rating| +----------+------------------+ | Party|29.791343723673183| | Fighting|29.786238532110104| | RPG|29.766858198235212| | Puzzle| 29.75960182496892| | Strategy|29.716170125333914| | Adventure|29.700736842105286| | Sports| 29.69997888067586| | Action|29.691722353955612| |Simulation|29.650710702341154| | Shooter|29.631074142534402| +----------+------------------+ Top 10 juegos más reseñados (usando Spark):
25/08/29 10:10:18 WARN TaskSetManager: Stage 11 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB. [Stage 11:=============================> (1 + 1) / 2]
+--------------------+-----+ | Game Title|count| +--------------------+-----+ |Pokémon Scarlet &...| 1274| | Minecraft| 1265| | FIFA 24| 1247| | Street Fighter V| 1245| | Tomb Raider (2013)| 1245| |Super Smash Bros....| 1235| | Fall Guys| 1232| | Hitman 3| 1229| | Tetris| 1224| | 1000-Piece Puzzle| 1222| +--------------------+-----+ only showing top 10 rows
Transforming and Cleaning Data¶
In [11]:
# Reduce the dataset to the columns the analysis actually uses.
print("\n=== LIMPIEZA Y TRANSFORMACIÓN DE DATOS ===")

# Columns kept for the analysis (this drops 'Requires Special Device'
# and 'Min Number of Players' from the original 18).
essential_columns = [
    'Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform',
    'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer',
    'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality',
    'Story Quality', 'User Review Text', 'Game Mode',
]

print(f"Columnas originales: {len(df.columns)}")
print(f"Columnas esenciales: {len(essential_columns)}")

# Independent copy so later mutations cannot touch the raw frame.
df_cleaned = df[essential_columns].copy()
# Spark mirror of the cleaned frame.
# NOTE(review): df_spark_cleaned is not referenced by any later cell in view.
df_spark_cleaned = spark.createDataFrame(df_cleaned)

print(f"Datos después de limpieza: {df_cleaned.shape}")
=== LIMPIEZA Y TRANSFORMACIÓN DE DATOS === Columnas originales: 18 Columnas esenciales: 16 Datos después de limpieza: (47774, 16)
Style Configuration for Visualizations¶
In [12]:
# Global plotting style: seaborn-like look for matplotlib, husl color palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
Visualizations (14 Charts)¶
1. Temporal Trend (Games per Year)¶
In [20]:
# Line chart: number of games released per year.
# Fix: the original allocated a 20x25-inch figure and drew into slot 8 of a
# 5x3 grid, leaving ~93% of the canvas blank (residue from a combined figure).
# A single appropriately-sized explicit Axes is used instead.
fig, ax = plt.subplots(figsize=(8, 5))
year_counts = df_cleaned['Release Year'].value_counts().sort_index()
ax.plot(year_counts.index, year_counts.values, marker='o', linewidth=2, markersize=4)
ax.set_title('Games Released per Year', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Games')
ax.grid(True, alpha=0.3)
plt.show()
2. Rating vs Precio (scatter plot)¶
In [18]:
# Scatter: user rating against price.
# Fix: replaced the 20x25 mostly-blank 5x3 grid canvas with one sized Axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df_cleaned['Price'], df_cleaned['User Rating'], alpha=0.5, color='purple')
ax.set_title('Rating vs. Price Relationship', fontsize=14, fontweight='bold')
ax.set_xlabel('Price ($)')
ax.set_ylabel('User Rating')
ax.grid(True, alpha=0.3)
plt.show()
3. User Rating Distribution¶
In [13]:
# Histogram of user ratings.
# Fix: replaced the 20x25 mostly-blank 5x3 grid canvas with one sized Axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df_cleaned['User Rating'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax.set_title('User Rating Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()
4. Top 10 Most Popular Games¶
In [14]:
# Horizontal bars: the 10 most-reviewed titles.
# Fix: single sized Axes instead of the 20x25 grid canvas; plt.show() also
# suppresses the stray `Text(...)` Out-repr the cell used to emit.
fig, ax = plt.subplots(figsize=(8, 5))
top_games = df_cleaned['Game Title'].value_counts().head(10)
ax.barh(range(len(top_games)), top_games.values)
ax.set_yticks(range(len(top_games)), top_games.index, fontsize=10)
ax.set_title('Top 10 Most Reviewed Games', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Reviews')
plt.show()
Out[14]:
Text(0.5, 0, 'Number of Reviews')
5. Distribution by Platform¶
In [15]:
# Pie chart: share of reviews per platform.
# Fix: single sized Axes instead of the 20x25 grid canvas.
# NOTE: platform_counts is reused by the insights cell below — keep the name.
fig, ax = plt.subplots(figsize=(6, 6))
platform_counts = df_cleaned['Platform'].value_counts()
ax.pie(platform_counts.values, labels=platform_counts.index, autopct='%1.1f%%')
ax.set_title('Distribution by Platform', fontsize=14, fontweight='bold')
plt.show()
Out[15]:
Text(0.5, 1.0, 'Distribution by Platform')
6. Average Rating by Genre¶
In [16]:
# Bar chart: mean user rating per genre, best first.
# Fix: single sized Axes instead of the 20x25 grid canvas.
# NOTE: genre_rating is reused by the insights cell below — keep the name.
fig, ax = plt.subplots(figsize=(8, 5))
genre_rating = df_cleaned.groupby('Genre')['User Rating'].mean().sort_values(ascending=False)
ax.bar(range(len(genre_rating)), genre_rating.values, color='lightcoral')
ax.set_xticks(range(len(genre_rating)), genre_rating.index, rotation=45, ha='right')
ax.set_title('Average Rating by Genre', fontsize=14, fontweight='bold')
ax.set_ylabel('Average Rating')
plt.show()
Out[16]:
Text(0, 0.5, 'Average Rating')
7. Price Distribution¶
In [17]:
# Histogram of game prices.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df_cleaned['Price'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
ax.set_title('Price Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()
8. Distribution by Age Group¶
In [19]:
# Bar chart: review counts per targeted age group.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(8, 5))
age_counts = df_cleaned['Age Group Targeted'].value_counts()
ax.bar(age_counts.index, age_counts.values, color='orange')
ax.set_title('Distribution by Age Group', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', rotation=45)
plt.show()
Out[19]:
([0, 1, 2, 3], [Text(0, 0, 'Teens'), Text(1, 0, 'Kids'), Text(2, 0, 'All Ages'), Text(3, 0, 'Adults')])
9. Rating by Game Duration¶
In [21]:
# Scatter: user rating against game length in hours.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df_cleaned['Game Length (Hours)'], df_cleaned['User Rating'], alpha=0.5, color='brown')
ax.set_title('Rating vs. Game Duration', fontsize=14, fontweight='bold')
ax.set_xlabel('Duration (Hours)')
ax.set_ylabel('User Rating')
ax.grid(True, alpha=0.3)
plt.show()
10. Top Publishers by Average Rating¶
In [22]:
# Horizontal bars: 10 publishers with the highest mean user rating.
# Fixes: (1) single sized Axes instead of the 20x25 grid canvas;
# (2) the title said "Top 9" while head(10) selects ten publishers.
fig, ax = plt.subplots(figsize=(8, 5))
pub_rating = df_cleaned.groupby('Publisher')['User Rating'].mean().sort_values(ascending=False).head(10)
ax.barh(range(len(pub_rating)), pub_rating.values)
ax.set_yticks(range(len(pub_rating)), pub_rating.index, fontsize=9)
ax.set_title('Top 10 Publishers by Rating', fontsize=14, fontweight='bold')
ax.set_xlabel('Average Rating')
plt.show()
Out[22]:
Text(0.5, 0, 'Average Rating')
11. Graphics Quality vs. Rating¶
In [23]:
# Bar chart: mean user rating per graphics-quality tier.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(8, 5))
graphics_rating = df_cleaned.groupby('Graphics Quality')['User Rating'].mean()
ax.bar(graphics_rating.index, graphics_rating.values, color='teal')
ax.set_title('Rating by Graphics Quality', fontsize=14, fontweight='bold')
ax.set_ylabel('Average Rating')
plt.show()
Out[23]:
Text(0, 0.5, 'Average Rating')
12. Multiplayer vs. Single Player¶
In [24]:
# Pie chart: multiplayer vs. single-player share.
# Fixes: (1) single sized Axes instead of the 20x25 grid canvas;
# (2) the old label comprehension ('Yes' if x=='Yes' else 'No') was an
# identity mapping for this Yes/No column — the index is used directly.
fig, ax = plt.subplots(figsize=(6, 6))
mp_counts = df_cleaned['Multiplayer'].value_counts()
ax.pie(mp_counts.values, labels=mp_counts.index, autopct='%1.1f%%',
       colors=['lightblue', 'salmon'])
ax.set_title('Multiplayer Distribution', fontsize=14, fontweight='bold')
plt.show()
Out[24]:
Text(0.5, 1.0, 'Multiplayer Distribution')
13. Correlation Heatmap¶
In [25]:
# Heatmap of pairwise Pearson correlations between the numeric columns.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(7, 6))
numeric_cols = ['User Rating', 'Price', 'Release Year', 'Game Length (Hours)']
correlation_matrix = df_cleaned[numeric_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f', ax=ax)
ax.set_title('Numerical Variable Correlations', fontsize=14, fontweight='bold')
plt.show()
Out[25]:
Text(0.5, 1.0, 'Numerical Variable Correlations')
14. Soundtrack Quality Distribution¶
In [26]:
# Bar chart: review counts per soundtrack-quality tier.
# Fix: single sized Axes instead of the 20x25 grid canvas.
fig, ax = plt.subplots(figsize=(8, 5))
soundtrack_counts = df_cleaned['Soundtrack Quality'].value_counts()
ax.bar(soundtrack_counts.index, soundtrack_counts.values, color='mediumpurple')
ax.set_title('Soundtrack Quality Distribution', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency')
plt.show()
Out[26]:
Text(0, 0.5, 'Frequency')
Análisis de Texto de Reseñas de Usuarios¶
In [27]:
# Concatenate every review into one space-separated string for the word cloud.
reviews_text = df_cleaned['User Review Text'].astype(str).str.cat(sep=' ')
In [28]:
# Word Cloud
plt.figure(figsize=(15, 8))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud – User Reviews', fontsize=16, fontweight='bold')
plt.show()
In [29]:
# Palabras más comunes
vectorizer = CountVectorizer(max_features=20, stop_words='english')
word_freq = vectorizer.fit_transform(df_cleaned['User Review Text'].astype(str))
feature_names = vectorizer.get_feature_names_out()
word_counts = word_freq.sum(axis=0).A1
plt.figure(figsize=(12, 6))
plt.bar(feature_names, word_counts, color='lightcoral')
plt.title('Top 20 Most Frequent Words in Reviews', fontsize=14, fontweight='bold')
plt.xlabel('Palabras')
plt.ylabel('Frecuencia')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
INSIGHTS CLAVE¶
In [30]:
# Headline insights.
# Fixes: (1) this cell used to read `genre_rating` and `platform_counts`
# created inside earlier *plotting* cells (hidden cross-cell state); both are
# recomputed locally so the cell runs standalone on a fresh kernel.
# (2) the Publisher groupby was computed twice — once is enough.
print(f"Rating promedio general: {df_cleaned['User Rating'].mean():.2f}")
print(f"Precio promedio: ${df_cleaned['Price'].mean():.2f}")
print(f"Duración promedio: {df_cleaned['Game Length (Hours)'].mean():.1f} horas")

genre_rating = df_cleaned.groupby('Genre')['User Rating'].mean().sort_values(ascending=False)
platform_counts = df_cleaned['Platform'].value_counts()
print(f"\nGénero con mejor rating: {genre_rating.index[0]} ({genre_rating.iloc[0]:.2f})")
print(f"Plataforma más popular: {platform_counts.index[0]} ({platform_counts.iloc[0]} juegos)")

publisher_rating = df_cleaned.groupby('Publisher')['User Rating'].mean().sort_values(ascending=False)
print(f"Publisher con mejor rating promedio: {publisher_rating.index[0]} ({publisher_rating.iloc[0]:.2f})")
Rating promedio general: 29.72 Precio promedio: $39.95 Duración promedio: 32.5 horas Género con mejor rating: Party (29.79) Plataforma más popular: PlayStation (9633 juegos) Publisher con mejor rating promedio: Innersloth (29.93)
In [31]:
# Análisis de precios por calidad
print(f"\nPrecio promedio por calidad gráfica:")
for quality in df_cleaned['Graphics Quality'].unique():
avg_price = df_cleaned[df_cleaned['Graphics Quality']==quality]['Price'].mean()
print(f" {quality}: ${avg_price:.2f}")
print(f"\nPorcentaje de juegos multiplayer: {(df_cleaned['Multiplayer']=='Yes').mean()*100:.1f}%")
Precio promedio por calidad gráfica: Medium: $39.91 Low: $39.80 High: $40.10 Ultra: $40.00 Porcentaje de juegos multiplayer: 49.6%
Conversion to Parquet Format¶
In [32]:
# Export the cleaned dataset to S3 as snappy-compressed Parquet, then verify
# the upload by listing the prefix and reading the file back.
print("\n=== CONVERSIÓN A FORMATO PARQUET ===")
# 1. DEFINE CONFIGURATION VARIABLES
S3_BUCKET = "xideralaws-curso-uriel"  # destination bucket
S3_PREFIX = "parquet"  # destination folder
s3_client = s3  # reuse the boto3 client created earlier
print(f"🎯 Bucket destino: {S3_BUCKET}")
print(f"📁 Carpeta destino: {S3_PREFIX}")
# 2. BUILD THE EXPORT FRAME
print("🔄 Preparando datos para export...")
# NOTE(review): despite the name, this frame is exported with pandas, not Spark
df_for_spark = df_cleaned.copy()
# 3. VERIFY AWS CREDENTIALS
# A 1-key listing is a cheap way to confirm the credential chain works.
try:
    test_response = s3_client.list_objects_v2(Bucket=S3_BUCKET, MaxKeys=1)
    print("✅ Credenciales AWS verificadas - Acceso a S3 confirmado")
except Exception as e:
    # Only reports the problem; the export below is still attempted.
    print(f"❌ Error de credenciales AWS: {e}")
    print("💡 Verifica tus credenciales AWS")
# 4. SIMPLE, RELIABLE EXPORT PATH - PANDAS + BOTO3
try:
    from io import BytesIO
    print("📤 Exportando con pandas + boto3...")
    # In-memory buffer: no temp file needed.
    parquet_buffer = BytesIO()
    # Serialize to Parquet in memory (pyarrow engine, snappy compression).
    df_for_spark.to_parquet(
        parquet_buffer,
        engine='pyarrow',
        compression='snappy',
        index=False
    )
    parquet_buffer.seek(0)
    # Destination object key.
    s3_key = f"{S3_PREFIX}/video_game_reviews_cleaned.parquet"
    # Upload the buffer to S3.
    print(f"⬆️ Subiendo a s3://{S3_BUCKET}/{s3_key}")
    s3_client.upload_fileobj(parquet_buffer, S3_BUCKET, s3_key)
    # Confirm the object exists and fetch its size/metadata.
    response = s3_client.head_object(Bucket=S3_BUCKET, Key=s3_key)
    size_mb = response['ContentLength'] / (1024*1024)
    print(f"🎉 ¡ÉXITO! Dataset exportado a S3")
    print(f"📁 Ubicación: s3://{S3_BUCKET}/{s3_key}")
    print(f"📦 Tamaño: {size_mb:.2f} MB")
    print(f"📅 Última modificación: {response['LastModified']}")
    print(f"🔢 Filas exportadas: {len(df_for_spark)}")
    print(f"🔢 Columnas exportadas: {len(df_for_spark.columns)}")
except Exception as e:
    print(f"❌ Error durante export: {e}")
    print("💡 Revisa las credenciales AWS y permisos del bucket")
# 5. VERIFY BY LISTING OBJECTS UNDER THE PREFIX
print("\n=== VERIFICACIÓN FINAL ===")
try:
    print("🔍 Verificando archivos en S3...")
    response = s3_client.list_objects_v2(
        Bucket=S3_BUCKET,
        Prefix=S3_PREFIX
    )
    # 'Contents' is absent when the listing is empty.
    if 'Contents' in response:
        print(f"✅ Archivos encontrados en s3://{S3_BUCKET}/{S3_PREFIX}/:")
        for obj in response['Contents']:
            size_mb = obj['Size'] / (1024*1024)
            print(f" 📄 {obj['Key']} ({size_mb:.2f} MB) - {obj['LastModified']}")
    else:
        print(f"⚠️ No se encontraron archivos en s3://{S3_BUCKET}/{S3_PREFIX}/")
except Exception as e:
    print(f"❌ Error listando objetos S3: {e}")
# 6. READ-BACK TEST FROM S3
print("\n=== PRUEBA DE LECTURA ===")
try:
    s3_key_test = f"{S3_PREFIX}/video_game_reviews_cleaned.parquet"
    print(f"📖 Intentando leer desde s3://{S3_BUCKET}/{s3_key_test}")
    # Download the object and wrap the bytes for pandas.
    obj = s3_client.get_object(Bucket=S3_BUCKET, Key=s3_key_test)
    parquet_data = BytesIO(obj['Body'].read())
    # Parse the Parquet bytes with pandas.
    df_test = pd.read_parquet(parquet_data)
    print(f"✅ ¡Lectura exitosa desde S3!")
    print(f"🔢 Filas leídas: {len(df_test)}")
    print(f"🔢 Columnas leídas: {len(df_test.columns)}")
    print("📊 Primeras columnas:", list(df_test.columns[:5]))
except Exception as e:
    print(f"❌ Error leyendo desde S3: {e}")
print("\n🏁 Proceso de conversión a Parquet completado")
=== CONVERSIÓN A FORMATO PARQUET === 🎯 Bucket destino: xideralaws-curso-uriel 📁 Carpeta destino: parquet 🔄 Preparando datos para export... ✅ Credenciales AWS verificadas - Acceso a S3 confirmado 📤 Exportando con pandas + boto3... ⬆️ Subiendo a s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet 🎉 ¡ÉXITO! Dataset exportado a S3 📁 Ubicación: s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet 📦 Tamaño: 0.43 MB 📅 Última modificación: 2025-08-29 10:12:31+00:00 🔢 Filas exportadas: 47774 🔢 Columnas exportadas: 16 === VERIFICACIÓN FINAL === 🔍 Verificando archivos en S3... ✅ Archivos encontrados en s3://xideralaws-curso-uriel/parquet/: 📄 parquet/video_game_reviews_cleaned.parquet (0.43 MB) - 2025-08-29 10:12:31+00:00 === PRUEBA DE LECTURA === 📖 Intentando leer desde s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet ✅ ¡Lectura exitosa desde S3! 🔢 Filas leídas: 47774 🔢 Columnas leídas: 16 📊 Primeras columnas: ['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform'] 🏁 Proceso de conversión a Parquet completado
SUBIR RESULTADO A S3 (OPCIONAL)¶
In [33]:
# Configuración para subir el archivo Parquet a S3
output_bucket = bucket_name
output_key = "processed_data/video_game_reviews_cleaned.parquet"
print(f"Archivo Parquet listo para subir a:")
print(f"Bucket: {output_bucket}")
print(f"Key: {output_key}")
Archivo Parquet listo para subir a: Bucket: xideralaws-curso-uriel Key: processed_data/video_game_reviews_cleaned.parquet
Resource Cleanup¶
In [34]:
# Final wrap-up: report completion status, release Spark, print a summary.
print("\n=== FINALIZANDO ANÁLISIS ===")
print("✅ Análisis completado exitosamente")
print("✅ Visualizaciones generadas")
print("✅ Análisis de texto realizado")
print("✅ Estadísticas calculadas")
print("✅ Datos convertidos a formato Parquet")
print("✅ Preparado para ejecución en AWS EMR con Spark Jobs")

# Release Spark resources; no Spark work may run after this point.
spark.stop()

print("\n🎯 RESUMEN FINAL:")
print(f"- Dataset original: {df.shape[0]} filas, {df.shape[1]} columnas")
print(f"- Dataset limpio: {df_cleaned.shape[0]} filas, {df_cleaned.shape[1]} columnas")
# Fix: removed f-prefixes below — these strings have no placeholders.
print("- 15 visualizaciones generadas")  # NOTE(review): the notebook renders 14 charts + 2 text plots — confirm this count
print("- Análisis de texto completado")
print("- Correlaciones calculadas")
print("- Archivo Parquet creado y verificado")
print("- Listo para producción en AWS EMR")
=== FINALIZANDO ANÁLISIS === ✅ Análisis completado exitosamente ✅ Visualizaciones generadas ✅ Análisis de texto realizado ✅ Estadísticas calculadas ✅ Datos convertidos a formato Parquet ✅ Preparado para ejecución en AWS EMR con Spark Jobs 🎯 RESUMEN FINAL: - Dataset original: 47774 filas, 18 columnas - Dataset limpio: 47774 filas, 16 columnas - 15 visualizaciones generadas - Análisis de texto completado - Correlaciones calculadas - Archivo Parquet creado y verificado - Listo para producción en AWS EMR
Evidencia de archivo .parquet guardado en Bucket¶
Visualización en Streamlit¶
Code¶
In [ ]:
# Streamlit dashboard for the video-game reviews dataset.
# Fix applied throughout: every matplotlib figure is closed after st.pyplot().
# Streamlit re-executes this whole script on each interaction, so unclosed
# figures accumulate in pyplot's global registry (matplotlib warns past 20).
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ==============================
# Page configuration
# ==============================
st.set_page_config(
    page_title="Video Game Reviews Dashboard",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for dark theme styling
st.markdown("""
<style>
.main {
    background-color: #0E1117;
}
.stApp {
    background-color: #0E1117;
}
.css-1d391kg {
    background-color: #262730;
}
.metric-container {
    background-color: #262730;
    padding: 1rem;
    border-radius: 0.5rem;
    margin: 0.5rem 0;
}
.stSelectbox > div > div {
    background-color: #262730;
}
.uploadedFile {
    background-color: #262730;
}
</style>
""", unsafe_allow_html=True)

# Title with gaming icon
st.title("🎮 Dashboard de Análisis de Videojuegos")

# Sidebar: CSV upload
st.sidebar.title("Datos")
st.sidebar.markdown("Sube tu archivo CSV de videojuegos")
st.sidebar.markdown("Drag and drop file here")
st.sidebar.markdown("Limit 200MB per file • CSV")
file = st.sidebar.file_uploader("", type=["csv"], label_visibility="collapsed")

# Stop the rerun early until the user provides a file.
if file is None:
    st.warning("⚠️ Sube un archivo CSV para continuar.")
    st.stop()

# Load the uploaded CSV.
df = pd.read_csv(file)

# ==============================
# KPIs
# ==============================
total_games = df.shape[0]
avg_rating = df['User Rating'].mean()
avg_price = df['Price'].mean()
total_platforms = df['Platform'].nunique()
avg_game_length = df['Game Length (Hours)'].mean()

c1, c2, c3, c4, c5 = st.columns(5)
with c1:
    st.metric("Total Juegos", f"{total_games:,}")
with c2:
    st.metric("Rating Promedio", f"{avg_rating:.1f}/100")
with c3:
    st.metric("Precio Promedio", f"${avg_price:.2f}")
with c4:
    st.metric("Plataformas", total_platforms)
with c5:
    st.metric("Duración Promedio", f"{avg_game_length:.1f} hrs")

st.markdown("---")

# ==============================
# Visualizations
# ==============================
# Matplotlib dark theme to match the page styling.
plt.style.use('dark_background')

# Row 1: two columns
cols = st.columns(2)

# --- Rating distribution ---
with cols[0]:
    st.subheader("Distribución de Ratings")
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(df['User Rating'], bins=20, alpha=0.7, color='#1f77b4', edgecolor='white')
    ax.set_xlabel("Rating del Usuario")
    ax.set_ylabel("Cantidad de Juegos")
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)
    plt.close(fig)  # free the figure (see header note)

# --- Games per platform ---
with cols[1]:
    st.subheader("Juegos por Plataforma")
    platform_counts = df['Platform'].value_counts()
    fig, ax = plt.subplots(figsize=(8, 5))
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    platform_counts.plot(kind='bar', ax=ax, color=colors[:len(platform_counts)])
    ax.set_xlabel("Plataforma")
    ax.set_ylabel("Cantidad de Juegos")
    ax.tick_params(axis='x', rotation=45)
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)
    plt.close(fig)

st.markdown("---")

# Row 2: two columns
cols2 = st.columns(2)

# --- Genre distribution (top 8 genres) ---
with cols2[0]:
    st.subheader("Distribución por Género")
    genre_counts = df['Genre'].value_counts().head(8)
    fig, ax = plt.subplots(figsize=(8, 6))
    colors = plt.cm.Set3(np.linspace(0, 1, len(genre_counts)))
    wedges, texts, autotexts = ax.pie(genre_counts.values, labels=genre_counts.index,
                                      autopct='%1.1f%%', colors=colors)
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)
    plt.close(fig)

# --- Price vs rating, colored by rating ---
with cols2[1]:
    st.subheader("Precio vs Rating")
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(df['Price'], df['User Rating'],
                         alpha=0.6, c=df['User Rating'],
                         cmap='viridis', s=50)
    ax.set_xlabel("Precio ($)")
    ax.set_ylabel("Rating del Usuario")
    plt.colorbar(scatter, ax=ax, label='Rating')
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)
    plt.close(fig)

st.markdown("---")

# Row 3: full width — average game length per genre
st.subheader("Duración de Juegos por Género")
fig, ax = plt.subplots(figsize=(12, 6))
genre_length = df.groupby('Genre')['Game Length (Hours)'].mean().sort_values(ascending=False)
bars = ax.bar(genre_length.index, genre_length.values,
              color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f'])
ax.set_xlabel("Género")
ax.set_ylabel("Duración Promedio (Horas)")
ax.tick_params(axis='x', rotation=45)
ax.set_facecolor('#0E1117')
fig.patch.set_facecolor('#0E1117')
# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 1,
            f'{height:.1f}', ha='center', va='bottom')
st.pyplot(fig)
plt.close(fig)

st.markdown("---")

# Summary table: rating / price / length aggregated per age group.
st.subheader("Estadísticas por Grupo de Edad")
age_stats = df.groupby('Age Group Targeted').agg({
    'User Rating': ['mean', 'count'],
    'Price': 'mean',
    'Game Length (Hours)': 'mean'
}).round(2)
age_stats.columns = ['Rating Promedio', 'Cantidad', 'Precio Promedio', 'Duración Promedio']
st.dataframe(age_stats, use_container_width=True)

st.markdown("---")
st.markdown("🎮 **Dashboard de Videojuegos** - Análisis completo de reviews y características")
Evidencia del código en Visual¶
Visualización 1¶
Visualización 2¶
Visualización 3¶
Lambda¶
In [ ]:
import json
import boto3
import pandas as pd
from io import StringIO, BytesIO

def lambda_handler(event, context):
    """AWS Lambda entry point: read the raw reviews CSV from S3, drop
    duplicate rows, write the result back to S3 as snappy-compressed
    Parquet, and return a JSON summary of the run.

    Parameters
    ----------
    event, context : standard Lambda invocation arguments (unused here).

    Returns
    -------
    dict
        statusCode 200 with a JSON body summarizing the export, or
        statusCode 500 with the error message if any step fails.
    """
    # S3 configuration: bucket plus input/output object keys.
    s3 = boto3.client("s3")
    bucket_name = "xideralaws-curso-uriel"
    input_key = "video_game_reviews.csv"
    output_key = "processed_data/video_game_reviews_cleaned.parquet"
    try:
        # 1. Load the CSV from S3 into pandas
        print("=== CARGANDO DATOS DESDE AWS S3 ===")
        response = s3.get_object(Bucket=bucket_name, Key=input_key)
        csv_content = response["Body"].read().decode("utf-8")
        df = pd.read_csv(StringIO(csv_content))
        print(f"✅ Datos cargados exitosamente desde S3")
        print(f"Filas: {df.shape[0]}, Columnas: {df.shape[1]}")
        # 2. Basic data processing
        print("=== PROCESANDO DATOS ===")
        # Work on a copy so the raw frame stays intact.
        df_cleaned = df.copy()
        # Drop exact duplicate rows.
        initial_rows = df_cleaned.shape[0]
        df_cleaned = df_cleaned.drop_duplicates()
        print(f"Duplicados eliminados: {initial_rows - df_cleaned.shape[0]}")
        # Nulls are only reported, not imputed or dropped.
        null_counts = df_cleaned.isnull().sum()
        print(f"Valores nulos por columna:\n{null_counts}")
        # 3. Serialize to Parquet in memory and upload to S3
        print("=== EXPORTANDO A PARQUET ===")
        parquet_buffer = BytesIO()
        df_cleaned.to_parquet(
            parquet_buffer,
            engine='pyarrow',
            compression='snappy',
            index=False
        )
        parquet_buffer.seek(0)
        # Upload the buffer to S3.
        s3.upload_fileobj(parquet_buffer, bucket_name, output_key)
        # Confirm the upload and read the object size.
        response = s3.head_object(Bucket=bucket_name, Key=output_key)
        size_mb = response['ContentLength'] / (1024*1024)
        # 4. Build the success response
        result = {
            'statusCode': 200,
            'body': json.dumps({
                'message': 'Procesamiento completado exitosamente',
                'input_file': f"s3://{bucket_name}/{input_key}",
                'output_file': f"s3://{bucket_name}/{output_key}",
                'rows_processed': df_cleaned.shape[0],
                'columns': df_cleaned.shape[1],
                'output_size_mb': round(size_mb, 2),
                'duplicates_removed': initial_rows - df_cleaned.shape[0]
            })
        }
        print("🎉 ¡Procesamiento completado!")
        return result
    except Exception as e:
        # Any failure (S3 access, parsing, serialization) maps to a 500.
        print(f"❌ Error: {str(e)}")
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': 'Error procesando datos',
                'message': str(e)
            })
        }
Evidencia de Lambda en Consola AWS¶
Architecture Diagram¶
Evidencia de Sparks¶
Jobs¶
Environment¶
Executors¶
In [ ]: